######################################
# Media Measurement Matters          #
# Replication Code                   #
# Appendix S: Financial Domains      #
######################################

# The following file contains code for replicating the sensitivity analyses
# presented in Appendix S. Appendix S examines the robustness of our results 
# to the inclusion of financial domains.

# Caution: the code in this file takes some time to run!

# Set-Up ----

# If desired, set up path into which to save plots and tables
plot_path <- NULL
table_path <- NULL

# Load packages
library(tidyverse)
library(ggridges)
library(ggpubr)
library(estimatr)
library(overlap)
library(gtools)

# Set up helper operations
`%notin%` <- Negate(`%in%`)

# Set up colors
red_mit = '#A31F34'
red_light = '#A9606C'
blue_mit = '#315485'
grey_light= '#C2C0BF'
grey_dark = '#8A8B8C'
black = '#353132'

# Source helper functions
source("helper_functions.R")

# > Read in data ----

# Read in survey data
srvy <- read_rds("data/survey_data_cleaned.rds")

# Read in web data
web <- read_rds("data/web_use.rds")

# Sensitivity to financial domains ----

# Examine alignment scores for commonly visited financial domains
bma <- read_csv("data/bakshy_top500.csv")

bma %>% 
  filter(str_detect(domain, "www.wsj|www.forbes|www.reuters")) %>% 
  group_by(domain) %>% 
  summarise(score = mean(avg_align)) %>% 
  as.data.frame()

# > Prepare simulated data ----

# Number of visits to each domain in our comScore data that could be matched
# to the BMA scores
visits <- web %>% 
  drop_na(avg_align) %>% 
  group_by(domain_recode) %>% 
  summarise(visits = n(), avg_align = mean(avg_align)) %>% 
  arrange(desc(visits))

# Estimate number of WSJ visits we would have observed ub iyr data, based on Guess (2021): 
# https://dataverse.harvard.edu/file.xhtml?fileId=4150519&version=1.0
guess_sites <- read_csv("data/guess_sites2016.csv") %>% 
  rename(domain_recode = domain,
         guess_visits = visits) 

n_wsj <- guess_sites %>% filter(domain_recode == "wsj.com") %>% pull(guess_visits)
align_wsj <- guess_sites %>% filter(domain_recode == "wsj.com") %>% pull(avg_align)

# Calculate proportion of visits to WSJ (vs. another domain) in the Guess (2021) data
guess_sites <- guess_sites %>% 
  mutate(prop_wsj = n_wsj/guess_visits)

# Estimate WSJ visits in our data vs. Guess (2021), assuming that visits to WSJ are
# similarly proportional to visits to other domains in our data
visits <- visits %>% 
  left_join(guess_sites %>% select(domain_recode, guess_visits, prop_wsj), 
            by = "domain_recode") %>% 
  mutate(n_wsj = round(visits * prop_wsj, 0)) 

# Use CNN as the benchmark for the number of domain visits to allocate
n_visits <- visits %>% filter(domain_recode == "cnn.com") %>% pull(n_wsj)

# > Allocate visits by stated preferences ----

# Calculate the polarization of visits to each domain by stated media preferences
# (Fox vs. MSNBC) in our web-browsing data
visit_polar <- web %>% 
  drop_na(avg_align) %>% 
  filter(med_pref != "Entertainment") %>% 
  group_by(domain_recode, med_pref) %>% 
  tally() %>% 
  pivot_wider(names_from = med_pref, values_from = n) %>% 
  rename(msnbc = MSNBC, fox = Fox) %>% 
  mutate(msnbc = coalesce(msnbc, 0), fox = coalesce(fox, 0),
         denom = fox + msnbc, prop_msnbc = msnbc/denom, prop_fox = fox/denom) %>% 
  arrange(desc(prop_msnbc))

# Examine level of audience polarization for Breitbart and Fox News
visit_polar %>% 
  filter(domain_recode %in% c("foxnews.com", "breitbart.com")) %>% 
  select(domain_recode, prop_fox, prop_msnbc)

# Assume (conservatively) that visits to WSJ were *more* polarized than visits
# to either of these sites (90/10 split for Fox/MSNBC preferrers)
msnbc_visits <- round(n_visits * 0.1)
fox_visits <- round(n_visits * 0.9)

msnbc_visits + fox_visits; n_visits 
  # Note: due to rounding, include one additional site visit in allocation

# Determine number of WSJ visits (of 15,016) to allocate to each respondent
resp_visits <- web %>% 
  filter(med_pref != "Entertainment") %>% 
  group_by(resp_id) %>% 
  summarise(med_pref = unique(med_pref),
            count = n()) %>%  # N of news visits for each respondent
  left_join(web %>% 
              filter(med_pref != "Entertainment") %>% 
              group_by(med_pref) %>% 
              summarise(total = n())) %>% # N of news visits for each stated pref group
  mutate(pref_prop = count/total) %>% 
  # Multiply proportion for each respondent by the total allocated visits within their
  # stated preference group, with number of visits adjusted to even out rounding
  mutate(wsj_count = case_when(med_pref == "MSNBC" ~ round(pref_prop * (msnbc_visits + 102)),
                               med_pref == "Fox" ~ round(pref_prop * (fox_visits - 2))))

c(fox_visits, msnbc_visits)
resp_visits %>% 
  group_by(med_pref) %>% 
  summarise(sum = sum(wsj_count))
  # Number of allocated visits follows expected pattern (with one additional site
  # visit, again conservatively, allocated to Fox)

# Create new rows for web data, corresponding to each respondents' WSJ site visits
  # Note: code takes several minutes to run!
new_data <- map(resp_visits$resp_id, function(.x){
  # Determine number of rows to allocate for each respondent
  n_visits <- resp_visits %>% 
    filter(resp_id == .x) %>% 
    pull(wsj_count)
  
  # Extract a row of their data from the web dataset, to keep same metadata
  samp_row <- web %>% 
    filter(resp_id == .x) %>% 
    head(1)
  
  # Transform into mock row of data corresponding to WSJ
  samp_row <- samp_row %>% 
    mutate(domain = "wsj.com", domain_recode = "wsj.com",
           avg_align = align_wsj)
  
  as.data.frame(lapply(samp_row, rep, n_visits))
}) %>% bind_rows()
nrow(new_data)

# Bind simulated rows to the original dataset
web_df <- web %>% 
  bind_rows(new_data) %>% 
  select(resp_id, os, browser, timestamp, domain, domain_recode, avg_align,
         pid, ideo, med_pref)

# > Regenerate relative slant scores ----

# Calculate relative slant scores for each respondent, excluding portals but 
# including simulated WSJ visits
scores <- web_df %>% 
  filter(domain_recode %notin% c("msn.com","aol.com","google.com")) %>%
  group_by(resp_id) %>% 
  summarise(score_wsj = mean(avg_align, na.rm = T)) 

# Merge scores into survey data
srvy <- srvy %>% 
  left_join(scores, by = "resp_id")

# > Descriptive results ----

# Figure S1(a): distribution of respondent-level slant, with portals removed but
# without the simulated WSJ visits (original scores)
s1_a <- score_plots(var = "med_pref", var_labels = c("Prefer\nMSNBC", 
                                                     "Prefer\nEntertainment", 
                                                     "Prefer\nFox"),
                    var_levels = c("MSNBC", "Entertainment", "Fox"), 
                    x_var = "score_noportals", x_label = "Respondent Avg. Slant Score\n(Original Scores)",
                    y_label = "Stated Media Preference")

ggsave(s1_a, path = plot_path, filename = "fig_s1_a.pdf",
       height= 3, width = 4.5, dpi = 600)

# Figure S1(b): distribution of respondent-level slant, with portals removed and
# with simulated WSJ visits added
s1_b <- score_plots(var = "med_pref", var_labels = c("Prefer\nMSNBC", 
                                                     "Prefer\nEntertainment", 
                                                     "Prefer\nFox"),
                    var_levels = c("MSNBC", "Entertainment", "Fox"), 
                    x_var = "score_wsj", x_label = "Respondent Avg. Slant Score\n(With WSJ Visits)",
                    y_label = "Stated Media Preference")

ggsave(s1_b, path = plot_path, filename = "fig_s1_b.pdf",
       height= 3, width = 4.5, dpi = 600)

# Figure S2(a): distribution of site-level slant, with portals removed (original scores)
web_noport <- web %>% 
  filter(domain_recode %notin% c("msn.com", "aol.com", "google.com")) # Original

s2_a <- visit_plots(var = "med_pref", var_labels = c("Prefer\nMSNBC", 
                                                     "Prefer\nEntertainment", 
                                                     "Prefer\nFox"),
                    var_levels = c("MSNBC", "Entertainment", "Fox"),
                    x_label = "Relative Slant of News Visits (Original Scores)",
                    y_label = "Stated Media Preference", weights = FALSE, 
                    df = web_noport, wsj = T,
                    exemplars = c("cnn.com","news.yahoo.com","msnbc.com",
                                  "foxnews.com","nytimes.com","wsj.com"))

ggsave(s2_a, path = plot_path, filename = "fig_s2_a.pdf",
       height= 3.5, width = 8, dpi = 600)

# Figure S2(b): distribution of site-level slant, with portals removed and with
# simulated WSJ visits added
web_df_noport <- web_df %>% 
  filter(domain_recode %notin% c("msn.com", "aol.com", "google.com")) # With WSJ

s2_b <- visit_plots(var = "med_pref", var_labels = c("Prefer\nMSNBC", 
                                                     "Prefer\nEntertainment", 
                                                     "Prefer\nFox"),
                    var_levels = c("MSNBC", "Entertainment", "Fox"),
                    x_label = "Relative Slant of News Visits (With WSJ Visits)",
                    y_label = "Stated Media Preference", weights = FALSE, 
                    df = web_df_noport, wsj = T,
                    exemplars = c("cnn.com","news.yahoo.com","msnbc.com",
                                  "foxnews.com","nytimes.com","wsj.com"))

ggsave(s2_b, path = plot_path, filename = "fig_s2_b.pdf",
       height= 3.5, width = 8, dpi = 600)

# Overlapping coefficient for respondent-level scores
overlap_medpref <- score_overlap(var = "score_wsj", group_var = "med_pref", 
                                 values = c("MSNBC", "Entertainment", "Fox"),
                                 out_contrasts = c("MSNBC vs. Fox",
                                                   "MSNBC vs. Entertainment",
                                                   "Fox vs. Entertainment"))

(wsj_resp_overlap <- overlap_medpref$`MSNBC vs. Fox`)

# Overlapping coefficient for visit-level scores
visit_overlap_medpref <- visit_overlap(group_var = "med_pref", df = web_df_noport,
                                       values = c("MSNBC", "Entertainment", "Fox"),
                                       out_contrasts = c("MSNBC vs. Fox",
                                                         "MSNBC vs. Entertainment",
                                                         "Fox vs. Entertainment"))

(wsj_site_overlap <- visit_overlap_medpref$`MSNBC vs. Fox`)

# > Experimental results ----

# For this section, just reverse the procedure above and instead allocate 90%
# of the WSJ visits to MSNBC preferrers and 10% to Fox preferrers to try to
# induce movement between revealed preference bins

# Reverse the allocation of site visits, such that MSNBC preferrers now get
# 90% of simulated site visits and Fox preferrers get 10%, with some adjustments
# to matched visits to account for rounding
msnbc_visits_rev <- fox_visits 
fox_visits_rev <- msnbc_visits

resp_visits <- resp_visits %>% 
  mutate(wsj_count_rev = case_when(med_pref == "MSNBC" ~ round(pref_prop * (msnbc_visits_rev + 11)),
                                   med_pref == "Fox" ~ round(pref_prop * (fox_visits_rev + 24))))

fox_visits_rev; msnbc_visits_rev
resp_visits %>% 
  group_by(med_pref) %>% 
  summarise(sum(wsj_count_rev))
  # Allocate slightly more WSJ visits to MSNBC (given rounding) to be conservative

# Create new rows for web data, corresponding to each respondents' WSJ site visits
  # Note: code takes several minutes to run!
new_data_rev <- map(resp_visits$resp_id, function(.x){
  # Determine number of rows to allocate for each respondent
  n_visits <- resp_visits %>% 
    filter(resp_id == .x) %>% 
    pull(wsj_count_rev)
  
  # Extract a row of their data from the web dataset, to keep same metadata
  samp_row <- web %>% 
    filter(resp_id == .x) %>% 
    head(1)
  
  # Transform into mock row of data corresponding to WSJ
  samp_row <- samp_row %>% 
    mutate(domain = "wsj.com", domain_recode = "wsj.com",
           avg_align = align_wsj)
  
  as.data.frame(lapply(samp_row, rep, n_visits))
}) %>% bind_rows()
nrow(new_data_rev)

# Bind simulated rows to the original dataset
web_rev <- web %>% 
  bind_rows(new_data_rev) %>% 
  select(resp_id, os, browser, timestamp, domain, domain_recode, avg_align,
         pid, ideo, med_pref)

# Calculate relative slant scores for each respondent, excluding portals but 
# including simulated WSJ visits
scores_rev <- web_rev %>% 
  filter(domain_recode %notin% c("msn.com","aol.com","google.com")) %>%
  group_by(resp_id) %>% 
  summarise(score_wsj_rev = mean(avg_align, na.rm = T)) 

# Merge scores into survey data
srvy <- srvy %>% 
  left_join(scores_rev, by = "resp_id")

# Code respondents into groups, based on how average score compares to exemplar sites
align_scores <- web %>% 
  group_by(domain_recode) %>% 
  summarise(avg_align = mean(avg_align)) 

cnn <- align_scores %>% filter(domain_recode == "cnn.com") %>% pull(avg_align)
yahoo <- align_scores %>% filter(domain_recode == "news.yahoo.com") %>% pull(avg_align)

srvy <- srvy %>% 
  mutate(score_code_rev = case_when(is.na(score_wsj_rev) ~ NA_real_,
                                    score_wsj_rev < cnn ~ 1,
                                    score_wsj_rev < yahoo ~ 2, TRUE ~ 3),
         score_code_orig = case_when(is.na(score_noportals) ~ NA_real_,
                                     score_noportals < cnn ~ 1,
                                     score_noportals < yahoo ~ 2, TRUE ~ 3))

# Compare original group size to version with WSJ visits
srvy %>% 
  filter(forcedchoice == 1) %>% 
  select(score_code_orig, score_code_rev) %>% 
  table()

# Set theme for plotting
theme_set(theme_bw() + 
            theme(legend.position = "bottom",
                  plot.title = element_text(hjust = 0.5, face = "bold",size = 16),
                  plot.subtitle = element_text(hjust = 0.5, face = "italic", size = 12),
                  axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm"),
                                              face = "bold", size = 12, angle = 0, hjust = 0.5),
                  axis.title.y = element_text(margin = unit(c(0, 3, 0, 0), "mm"), size = 12),
                  legend.title = element_text(face = "bold", hjust = 0.5, size = 12),
                  legend.text = element_text(hjust = 0.5, size = 10),
                  axis.text.y = element_text(size = 10, color = "black"),
                  legend.box = "vertical",
                  legend.background = element_blank(),
                  legend.box.background = element_rect(colour = "black"),
                  text=element_text(colour="black", 
                                    size=15)))

# Format labels for plotting
code_labels <- gen_ranges(bin_var = "score_code_rev", score_version = "score_wsj_rev",
                          parentheses = TRUE)
code_labels <- paste(c("More Liberal\nThan CNN\n", "Between CNN\nand Yahoo!\n", 
                       "More Conserv.\nThan Yahoo!\n"), 
                     code_labels, sep = "")

# Estimate persuasion effects within newly defined groups
score_vsent_code <- group_vsent_plot(var = "score_code_rev", nbins = 3, 
                                     labels = code_labels,
                                     weights = FALSE)

# Figure S3: treatment effects 
(s3 <- ggplot(na.omit(score_vsent_code %>% filter(id != "Fox vs.\nMSNBC")), 
                           aes(x=factor(val),
                               col = factor(id, levels = c("Fox vs.\nEntertainment",
                                                           "MSNBC vs.\nEntertainment")),
                               shape = factor(id, levels = c("Fox vs.\nEntertainment",
                                                             "MSNBC vs.\nEntertainment")))) +
    geom_hline(yintercept=0, col = "white") +
    geom_hline(yintercept=0, linetype="dashed", color = grey_dark) +
    geom_errorbar(aes(ymin=min_cilo90, ymax=max_cihi90),
                  width=0, lwd = 1, position = position_dodge(width = 0.5)) +
    geom_errorbar(aes(ymin=min_cilo, ymax=max_cihi),
                  width=0, position = position_dodge(width = 0.5)) +
    geom_point(aes(y=naive),
               position = position_dodge(width = 0.5),
               size = 2) +
    facet_wrap(~ outcome,nrow=1) +
    scale_x_discrete(labels = unique(score_vsent_code$bin)) + 
    xlab("Relative Slant of News Consumption (Including WSJ Visits)") +
    ylab("Average Treatment Effect of\nPartisan Media vs. Entertainment") +
    scale_y_continuous(breaks=seq(-0.2,0.2,0.1),
                       labels=plot_labels(break_val = 0.2, by_val = 0.1)$att,
                       limits = c(-0.225, 0.225),
                       sec.axis = dup_axis(name="",
                                           breaks=seq(-0.2,0.2,0.1),
                                           labels = plot_labels(break_val = 0.2, by_val = 0.1)$share)) +
    scale_colour_manual("Comparison",values=c(red_mit, blue_mit)) +
    scale_shape_manual("Comparison",values=c(16, 17, 15)) + 
    theme(axis.text.x = element_text(size = 10, angle = 0, hjust = 0.5, color = "black")))

ggsave(s3, path = plot_path, filename = "fig_s3.pdf", 
       width=9, height=5.25, dpi = 600)

# Sensitivity to financial domains - 50K visits ----

# Now, assume that WSJ was more widely visited in our data than in the Guess (2021)
# data, setting the number of domain visits to 50,000
n_visits <- 50000

# > Allocate visits by stated preferences ----

# Assume same 90/10 split of WSJ to Fox/MSNBC preferrers
msnbc_visits <- round(n_visits * 0.1)
fox_visits <- round(n_visits * 0.9)

msnbc_visits + fox_visits; n_visits 

# Determine number of WSJ visits (of 50K) to allocate to each respondent
resp_visits <- web %>% 
  filter(med_pref != "Entertainment") %>% 
  group_by(resp_id) %>% 
  summarise(med_pref = unique(med_pref),
            count = n()) %>%  # N of news visits for each respondent
  left_join(web %>% 
              filter(med_pref != "Entertainment") %>% 
              group_by(med_pref) %>% 
              summarise(total = n())) %>% # N of news visits for each stated pref group
  mutate(pref_prop = count/total) %>% 
  # Multiply proportion for each respondent by the total allocated visits within their
  # stated preference group, with number of visits adjusted to even out rounding
  mutate(wsj_count = case_when(med_pref == "MSNBC" ~ round(pref_prop * (msnbc_visits + 9)),
                               med_pref == "Fox" ~ round(pref_prop * (fox_visits + 12))))

c(fox_visits, msnbc_visits)
resp_visits %>% 
  group_by(med_pref) %>% 
  summarise(sum = sum(wsj_count))

# Create new rows for web data, corresponding to each respondents' WSJ site visits
# Note: code takes several minutes to run!
new_data_50k <- map(resp_visits$resp_id, function(.x){
  # Determine number of rows to allocate for each respondent
  n_visits <- resp_visits %>% 
    filter(resp_id == .x) %>% 
    pull(wsj_count)
  
  # Extract a row of their data from the web dataset, to keep same metadata
  samp_row <- web %>% 
    filter(resp_id == .x) %>% 
    head(1)
  
  # Transform into mock row of data corresponding to WSJ
  samp_row <- samp_row %>% 
    mutate(domain = "wsj.com", domain_recode = "wsj.com",
           avg_align = align_wsj)
  
  as.data.frame(lapply(samp_row, rep, n_visits))
}) %>% bind_rows()
nrow(new_data_50k)

# Bind simulated rows to the original dataset
web_df_50k <- web %>% 
  bind_rows(new_data_50k) %>% 
  select(resp_id, os, browser, timestamp, domain, domain_recode, avg_align,
         pid, ideo, med_pref)

# > Regenerate relative slant scores ----

# Calculate relative slant scores for each respondent, excluding portals but 
# including simulated WSJ visits
scores_50k <- web_df_50k %>% 
  filter(domain_recode %notin% c("msn.com","aol.com","google.com")) %>%
  group_by(resp_id) %>% 
  summarise(score_wsj_50k = mean(avg_align, na.rm = T)) 

# Merge scores into survey data
srvy <- srvy %>% 
  left_join(scores_50k, by = "resp_id")

# > Descriptive results ----

# Figure S4(a) is identical to Figure S1(a), generated above. Figure S5(a) is
# also identical to Figure S2(a).

# Figure S4(b): distribution of respondent-level slant, with portals removed and
# with 50K simulated WSJ visits added
s4_b <- score_plots(var = "med_pref", var_labels = c("Prefer\nMSNBC", 
                                                     "Prefer\nEntertainment", 
                                                     "Prefer\nFox"),
                    var_levels = c("MSNBC", "Entertainment", "Fox"), 
                    x_var = "score_wsj_50k", x_label = "Respondent Avg. Slant Score\n(With 50K WSJ Visits)",
                    y_label = "Stated Media Preference")

ggsave(s4_b, path = plot_path, filename = "fig_s4_b.pdf",
       height= 3, width = 4.5, dpi = 600)

# Figure S5(b): distribution of site-level slant, with portals removed and with
# 50K simulated WSJ visits added
web_df_noport_50k <- web_df_50k %>% 
  filter(domain_recode %notin% c("msn.com", "aol.com", "google.com")) # With WSJ

s5_b <- visit_plots(var = "med_pref", var_labels = c("Prefer\nMSNBC", 
                                                     "Prefer\nEntertainment", 
                                                     "Prefer\nFox"),
                    var_levels = c("MSNBC", "Entertainment", "Fox"),
                    x_label = "Relative Slant of News Visits (With 50K WSJ Visits)",
                    y_label = "Stated Media Preference", weights = FALSE, 
                    df = web_df_noport_50k, wsj = T,
                    exemplars = c("cnn.com","news.yahoo.com","msnbc.com",
                                  "foxnews.com","nytimes.com","wsj.com"))

ggsave(s5_b, path = plot_path, filename = "fig_s5_b.pdf",
       height= 3.5, width = 8, dpi = 600)

# Overlapping coefficient for respondent-level scores
overlap_medpref_50k <- score_overlap(var = "score_wsj_50k", group_var = "med_pref", 
                                 values = c("MSNBC", "Entertainment", "Fox"),
                                 out_contrasts = c("MSNBC vs. Fox",
                                                   "MSNBC vs. Entertainment",
                                                   "Fox vs. Entertainment"))

(wsj_resp_overlap_50k <- overlap_medpref_50k$`MSNBC vs. Fox`)

# Overlapping coefficient for visit-level scores
visit_overlap_medpref_50k <- visit_overlap(group_var = "med_pref", df = web_df_noport_50k,
                                       values = c("MSNBC", "Entertainment", "Fox"),
                                       out_contrasts = c("MSNBC vs. Fox",
                                                         "MSNBC vs. Entertainment",
                                                         "Fox vs. Entertainment"))

(wsj_site_overlap_50k <- visit_overlap_medpref_50k$`MSNBC vs. Fox`)

# > Experimental results ----

# For this section, again reverse the procedure above and instead allocate 90%
# of the WSJ visits to MSNBC preferrers and 10% to Fox preferrers to try to
# induce movement between revealed preference bins

# Reverse the allocation of site visits, such that MSNBC preferrers now get
  # 90% of simulated site visits and Fox preferrers get 10%, with some adjustments
  # to matched visits to account for rounding
msnbc_visits_rev <- fox_visits 
fox_visits_rev <- msnbc_visits

resp_visits <- resp_visits %>% 
  mutate(wsj_count_rev = case_when(med_pref == "MSNBC" ~ round(pref_prop * (msnbc_visits_rev - 3)),
                                   med_pref == "Fox" ~ round(pref_prop * (fox_visits_rev + 1))))

fox_visits_rev; msnbc_visits_rev
resp_visits %>% 
  group_by(med_pref) %>% 
  summarise(sum(wsj_count_rev))
  # Allocate slightly more WSJ visits to MSNBC (given rounding) to be conservative

# Create new rows for web data, corresponding to each respondents' WSJ site visits
  # Note: code takes several minutes to run!
new_data_rev_50k <- map(resp_visits$resp_id, function(.x){
  # Determine number of rows to allocate for each respondent
  n_visits <- resp_visits %>% 
    filter(resp_id == .x) %>% 
    pull(wsj_count_rev)
  
  # Extract a row of their data from the web dataset, to keep same metadata
  samp_row <- web %>% 
    filter(resp_id == .x) %>% 
    head(1)
  
  # Transform into mock row of data corresponding to WSJ
  samp_row <- samp_row %>% 
    mutate(domain = "wsj.com", domain_recode = "wsj.com",
           avg_align = align_wsj)
  
  as.data.frame(lapply(samp_row, rep, n_visits))
}) %>% bind_rows()
nrow(new_data_rev_50k)

# Bind simulated rows to the original dataset
web_rev_50k <- web %>% 
  bind_rows(new_data_rev_50k) %>% 
  select(resp_id, os, browser, timestamp, domain, domain_recode, avg_align,
         pid, ideo, med_pref)

# Calculate relative slant scores for each respondent, excluding portals but 
# including simulated WSJ visits
scores_rev_50k <- web_rev_50k %>% 
  filter(domain_recode %notin% c("msn.com","aol.com","google.com")) %>%
  group_by(resp_id) %>% 
  summarise(score_wsj_rev_50k = mean(avg_align, na.rm = T)) 

# Merge scores into survey data
srvy <- srvy %>% 
  left_join(scores_rev_50k, by = "resp_id")

# Code respondents into groups, based on how average score compares to exemplar sites
srvy <- srvy %>% 
  mutate(score_code_rev_50k = case_when(is.na(score_wsj_rev_50k) ~ NA_real_,
                                    score_wsj_rev_50k < cnn ~ 1,
                                    score_wsj_rev_50k < yahoo ~ 2, TRUE ~ 3))

# Compare original group size to version with WSJ visits
srvy %>% 
  filter(forcedchoice == 1) %>% 
  select(score_code_orig, score_code_rev_50k) %>% 
  table()

# Format labels for plotting
code_labels_50k <- gen_ranges(bin_var = "score_code_rev_50k", score_version = "score_wsj_rev_50k",
                          parentheses = TRUE)
code_labels_50k <- paste(c("More Liberal\nThan CNN\n", "Between CNN\nand Yahoo!\n", 
                       "More Conserv.\nThan Yahoo!\n"), 
                       code_labels_50k, sep = "")

# Estimate persuasion effects within newly defined groups
score_vsent_code_50k <- group_vsent_plot(var = "score_code_rev_50k", nbins = 3, 
                                         labels = code_labels_50k,
                                         weights = FALSE)

# Figure S6: treatment effects 
(s6 <- ggplot(na.omit(score_vsent_code_50k %>% filter(id != "Fox vs.\nMSNBC")), 
              aes(x=factor(val),
                  col = factor(id, levels = c("Fox vs.\nEntertainment",
                                              "MSNBC vs.\nEntertainment")),
                  shape = factor(id, levels = c("Fox vs.\nEntertainment",
                                                "MSNBC vs.\nEntertainment")))) +
    geom_hline(yintercept=0, col = "white") +
    geom_hline(yintercept=0, linetype="dashed", color = grey_dark) +
    geom_errorbar(aes(ymin=min_cilo90, ymax=max_cihi90),
                  width=0, lwd = 1, position = position_dodge(width = 0.5)) +
    geom_errorbar(aes(ymin=min_cilo, ymax=max_cihi),
                  width=0, position = position_dodge(width = 0.5)) +
    geom_point(aes(y=naive),
               position = position_dodge(width = 0.5),
               size = 2) +
    facet_wrap(~ outcome,nrow=1) +
    scale_x_discrete(labels = unique(score_vsent_code_50k$bin)) + 
    xlab("Relative Slant of News Consumption (Including 50K WSJ Visits)") +
    ylab("Average Treatment Effect of\nPartisan Media vs. Entertainment") +
    scale_y_continuous(breaks=seq(-0.2,0.2,0.1),
                       labels=plot_labels(break_val = 0.2, by_val = 0.1)$att,
                       limits = c(-0.225, 0.225),
                       sec.axis = dup_axis(name="",
                                           breaks=seq(-0.2,0.2,0.1),
                                           labels = plot_labels(break_val = 0.2, by_val = 0.1)$share)) +
    scale_colour_manual("Comparison",values=c(red_mit, blue_mit)) +
    scale_shape_manual("Comparison",values=c(16, 17, 15)) + 
    theme(axis.text.x = element_text(size = 10, angle = 0, hjust = 0.5, color = "black")))

ggsave(s6, path = plot_path, filename = "fig_s6.pdf", 
       width=9, height=5.25, dpi = 600)
